##### ########################################################## ######
#####                                                            ######
#####   Input: debates/sentence-level dictionary scores          ######
#####   Output: various!                                         ######
#####                                                            ######
##### ########################################################## ######

rm(list=ls())
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
testing <- FALSE

# Load libraries

library(quanteda) # CRAN v.3.0.0
library(data.table) # CRAN v.1.13.6
library(ggplot2) # CRAN v.3.3.3
library(rstan) # CRAN v. 2.21.2

vars <- c("anecdote_std", "affect_std", "posemo_std", "negemo_std", "fact_std", "aggression_std", "complexity_std", "repetition_std")
var_names <- c("Human Narrative", "Affect", "Positive Emotion", "Negative Emotion", "Fact", "Aggression", "Complexity", "Repetition")

# Load data

load("working/speech_scores.Rdata")

## Drop all debates with fewer than 5 participants

speech_scores[,keep:=.N > 4, by = section_id]
speech_scores <- speech_scores[keep == TRUE]

## Drop the speaker

speech_scores <- speech_scores[is_speaker == F]

## Drop all debates before Blair

speech_scores <- speech_scores[speech_scores$parliamentary_term != "1992-1997"]
speech_scores$parliamentary_term <- droplevels(speech_scores$parliamentary_term)
speech_scores$session <- factor(speech_scores$session)
session_labels <- levels(speech_scores$session)
session_labels <- gsub("-19|-20","-",substring(session_labels,3,10))

####  Model 2 - time analysis ####

load("working/stan_out/standata_model2.Rdata")

time_effect_plot <- function(title = "gender_effect_time", var_name = "alpha_gender_effect", model = "model2", plot_expectations = TRUE, ylims = NULL, ...){
  
pdf(paste0("analysis/plots/",title,".pdf"), width = 12, height = 6)
par(mfrow = c(2,4))

for(outcome in vars){
  
  load(paste0("working/stan_out/",model,"/",outcome,"_posterior.Rdata"))  
  
  
  params <- extract(posterior)
  
  gender_effect_est <- apply(params[[var_name]],2, quantile, 0.5)
  gender_effect_hi <- apply(params[[var_name]],2,quantile,0.975)
  gender_effect_lo <- apply(params[[var_name]],2,quantile,0.025)
  
  if(is.null(ylims)) ylims = range(c(gender_effect_lo, gender_effect_hi,gender_effect_lo*-1, gender_effect_hi*-1))
  
plot(1:length(gender_effect_est), gender_effect_est, 
     ylim = ylims, 
     xlab = "", main = var_names[vars %in% outcome], 
     pch = 19, xaxt = "n", bty = "n",...)

if(plot_expectations){
  if(outcome %in% c("affect_std", "posemo_std", "anecdote_std", "posemo2_std")){
    rect(xleft = 0, xright = length(session_labels) + 1, ytop = .2, ybottom = 0, col = alpha("grey",.3), border = NA)  
  }else{
    rect(xleft = 0, xright = length(session_labels) + 1, ybottom = -.2, ytop = 0, col = alpha("grey",.3), border = NA)  
  }
  
}

points(1:length(gender_effect_est), gender_effect_est, 
       pch = 19)

segments(1:length(gender_effect_est), 
         y0 = gender_effect_lo, 
         y1 = gender_effect_hi) 
if(plot_expectations) abline(h = 0)
axis(1, at = 1:length(session_labels), labels = session_labels, las = 2)


}

dev.off()

}

time_effect_plot(title = "model2_gender_effect_time", var_name = "alpha_gender_effect", model = "model2", ylab = "Female MP difference", ylims = c(-.2,.2))

time_effect_plot(title = "model2_gender_effect_time_control", var_name = "alpha_gender_effect", model = "model2_control", ylab = "Female MP difference", ylims = c(-.15,.15))

plot_average_styles <- function(outcome = "posemo2_std"){
  
  load(paste0("working/stan_out/model2/",outcome,"_posterior.Rdata"))  
  params <- extract(posterior)
  
  men_est <- apply(params[["alpha_intercept"]],2, quantile, .5)
  men_lo <- apply(params[["alpha_intercept"]],2,quantile,0.025)
  men_hi <- apply(params[["alpha_intercept"]],2,quantile,0.975)
  
  women_est <- apply(params[["alpha_intercept"]] + params[["alpha_gender_effect"]],2, quantile, .5)
  women_lo <- apply(params[["alpha_intercept"]] + params[["alpha_gender_effect"]],2, quantile,0.025)
  women_hi <- apply(params[["alpha_intercept"]] + params[["alpha_gender_effect"]],2, quantile,0.975)
  
  plot(1:length(women_est), women_est, 
       ylim = c(-.3, .2),
       xlab = "", main = var_names[vars %in% outcome], 
       pch = 19, xaxt = "n", bty = "n", ylab = "Average style use")
  segments(1:length(women_est), 
           y0 = women_lo, 
           y1 = women_hi) 
  points(1:length(men_est), men_est, pch = 1)
  segments(1:length(men_est), 
           y0 = men_lo, 
           y1 = men_hi,
           lty = 2) 
  
  if(outcome %in% c("affect_std", "posemo_std", "posemo2_std", "anecdote_std")) legend("topright", legend = c("Women", "Men"), pch = c(19,1), lty = c(1,2), bty = "n")
  if(outcome %in% c("negemo_std", "negemo2_std", "fact_std", "complexity_std")) legend("topleft", legend = c("Women", "Men"), pch = c(19,1), lty = c(1,2), bty = "n")
  if(outcome %in% c("aggression_std")) legend("bottomright", legend = c("Women", "Men"), pch = c(19,1), lty = c(1,2), bty = "n")
  if(outcome %in% c("repetition_std")) legend("bottomleft", legend = c("Women", "Men"), pch = c(19,1), lty = c(1,2), bty = "n")
  axis(1, at = 1:length(session_labels), labels = session_labels, las = 2)
}
    
pdf("analysis/plots/model2_average_style_use.pdf",12,6)  
par(mfrow = c(2,4))
for(i in 1:length(vars)) print(plot_average_styles(vars[i]))
dev.off()


####  Model 2 - drift vs replacement ####
## Uncertainty estimates

load("working/stan_out/standata_model2.Rdata")

get_replacement_socialisation_effects_chain <- function(v, ...){
  
  load(paste0("working/stan_out/model2/",vars[v],"_posterior.Rdata"))
  
  speech_scores$person_parl_id <- paste0(speech_scores$person_id, "_",speech_scores$session)
  
  speech_scores$person_parl_id <- as.numeric(as.factor(speech_scores$person_parl_id))
  speech_scores$person_id_new <- as.numeric(as.factor(as.character(speech_scores$person_id)))
  
  person_session_raw <- speech_scores[,list(person = unique(person_id_new), 
                             session = unique(session), 
                             gender = unique(gender)),by = person_parl_id]
  
  setkey(person_session_raw, person_parl_id)
  
  params <- extract(posterior)
  
  out_chain <- array(NA, c(nrow(params$alpha), 
                           length(unique(speech_scores$session))-1,
                           10))
  
  it<-1
  
  for(it in 1:nrow(params$alpha)){
  cat(".")
    tmp <- person_session_raw
    
    alpha_est <- params$alpha[it,]
  
    tmp$alpha <- alpha_est
  
    setkey(tmp, person, session)
  
  ## Loop over sessions
    out_list <- list()
    i<-4
    
    for(i in 2:length(levels(tmp$session))){
    
    data_t_1 <- tmp[session == levels(tmp$session)[i]]
    data_t_0 <- tmp[session == levels(tmp$session)[i-1]]
    
    data_t_1$in_previous <- data_t_1$person %in% data_t_0$person
    data_t_0$in_next <- data_t_0$person %in% data_t_1$person
    
    mean_remain_men_t_0 <- mean(data_t_0[in_next == TRUE & gender == "Male"]$alpha)
    mean_remain_men_t_1 <- mean(data_t_1[in_previous == TRUE & gender == "Male"]$alpha)
    
    mean_remain_women_t_0 <- mean(data_t_0[in_next == TRUE & gender == "Female"]$alpha)
    mean_remain_women_t_1 <- mean(data_t_1[in_previous == TRUE & gender == "Female"]$alpha)
    
    mean_leavers_men_t_0 <- mean(data_t_0[in_next == FALSE & gender == "Male"]$alpha)
    mean_leavers_women_t_0 <- mean(data_t_0[in_next == FALSE & gender == "Female"]$alpha)
    
    mean_joiners_men_t_1 <- mean(data_t_1[in_previous == FALSE & gender == "Male"]$alpha)
    mean_joiners_women_t_1 <- mean(data_t_1[in_previous == FALSE & gender == "Female"]$alpha)
    
    frac_remainers_men_t_0 <- mean(data_t_0[gender == "Male"]$in_next)
    frac_remainers_men_t_1 <- mean(data_t_1[gender == "Male"]$in_previous)
    
    frac_remainers_women_t_0 <- mean(data_t_0[gender == "Female"]$in_next)
    frac_remainers_women_t_1 <- mean(data_t_1[gender == "Female"]$in_previous)
    
    ## Remove NANs
    
    if(is.nan(mean_remain_men_t_0)) mean_remain_men_t_0 <- 0
    if(is.nan(mean_remain_men_t_1)) mean_remain_men_t_1 <- 0
    if(is.nan(mean_remain_women_t_0)) mean_remain_women_t_0 <- 0
    if(is.nan(mean_remain_women_t_1)) mean_remain_women_t_1 <- 0
    if(is.nan(mean_leavers_men_t_0)) mean_leavers_men_t_0 <- 0
    if(is.nan(mean_leavers_women_t_0)) mean_leavers_women_t_0 <- 0
    if(is.nan(mean_joiners_men_t_1)) mean_joiners_men_t_1 <- 0
    if(is.nan(mean_joiners_women_t_1)) mean_joiners_women_t_1 <- 0
    if(is.nan(frac_remainers_men_t_0)) frac_remainers_men_t_0 <- 0
    if(is.nan(frac_remainers_men_t_1)) frac_remainers_men_t_1 <- 0
    if(is.nan(frac_remainers_women_t_0)) frac_remainers_women_t_0 <- 0
    if(is.nan(frac_remainers_women_t_1)) frac_remainers_women_t_1 <- 0
    
    mean_men_t_0 <- (frac_remainers_men_t_0 * mean_remain_men_t_0) + 
      ((1 - frac_remainers_men_t_0) * mean_leavers_men_t_0)
    
    mean_women_t_0 <- (frac_remainers_women_t_0 * mean_remain_women_t_0) + 
      ((1 - frac_remainers_women_t_0) * mean_leavers_women_t_0)
    
    mean_men_t_1 <- (frac_remainers_men_t_1 * mean_remain_men_t_1) + 
      ((1 - frac_remainers_men_t_1) * mean_joiners_men_t_1)
    
    mean_women_t_1 <- (frac_remainers_women_t_1 * mean_remain_women_t_1) + 
      ((1 - frac_remainers_women_t_1) * mean_joiners_women_t_1)
    
    socialization_effect_men <- (frac_remainers_men_t_1 * mean_remain_men_t_1) - (frac_remainers_men_t_0 * mean_remain_men_t_0)
    
    socialization_effect_women <- (frac_remainers_women_t_1 * mean_remain_women_t_1) - (frac_remainers_women_t_0 * mean_remain_women_t_0)
    
    replacement_effect_men <- ((1 - frac_remainers_men_t_1) * mean_joiners_men_t_1) - ((1 - frac_remainers_men_t_0) * mean_leavers_men_t_0)
    
    replacement_effect_women <- ((1 - frac_remainers_women_t_1) * mean_joiners_women_t_1) - ((1 - frac_remainers_women_t_0) * mean_leavers_women_t_0)
    
    socialisation_effect_diff <-  socialization_effect_women - socialization_effect_men
    
    replacement_effect_diff <- replacement_effect_women - replacement_effect_men
    
    out_chain[it,i-1,] <- c(mean_men_t_0, mean_men_t_1, mean_women_t_0, mean_women_t_1, socialization_effect_men, socialization_effect_women, socialisation_effect_diff, replacement_effect_men, replacement_effect_women, replacement_effect_diff)
    
    
  }
  
  }
  
  return(out_chain)
  
}

out_chains <- lapply(1:length(vars), function(x) get_replacement_socialisation_effects_chain(x))

socialisation_effect_diffs_men <- do.call("rbind",lapply(out_chains,function(x) quantile(rowMeans(x[,,5]),c(0.025,.5,.975))))
replacement_effect_diffs_men <- do.call("rbind",lapply(out_chains,function(x) quantile(rowMeans(x[,,8]),c(0.025,.5,.975))))

socialisation_effect_diffs_women <- do.call("rbind",lapply(out_chains,function(x) quantile(rowMeans(x[,,6]),c(0.025,.5,.975))))
replacement_effect_diffs_women <- do.call("rbind",lapply(out_chains,function(x) quantile(rowMeans(x[,,9]),c(0.025,.5,.975))))

socialisation_effect_diffs_all <- do.call("rbind",lapply(out_chains,function(x) quantile(rowMeans(x[,,7]),c(0.025,.5,.975))))
replacement_effect_diffs_all <- do.call("rbind",lapply(out_chains,function(x) quantile(rowMeans(x[,,10]),c(0.025,.5,.975))))

xlims <- range(c(socialisation_effect_diffs_men, replacement_effect_diffs_men,
                 socialisation_effect_diffs_women, replacement_effect_diffs_women,
                 socialisation_effect_diffs_all, replacement_effect_diffs_all))



pdf("analysis/plots/model2_replacement_socialisation_decomposition_average_uncertainty.pdf", 15, 4.5)

shift <- .1
ylims <- range(c((length(vars):1)) - shift, (length(vars):1)+shift)
par(mfrow = c(1,3), mar = c(5,8,4,2)+0.1, cex = .85)

plot(x = socialisation_effect_diffs_men[,2], y = (length(vars):1)+shift, xlim = xlims, pch = 19, ylab = "", yaxt = "n", xlab = "Average change in male style over time", main = "Male MPs", ylim = ylims)
points(x = replacement_effect_diffs_men[,2], y = (length(vars):1)-shift, pch = 1)
abline(v = 0, lty = 3)
segments(x0 = socialisation_effect_diffs_men[,1], x1 = socialisation_effect_diffs_men[,3], y0 = (length(vars):1)+shift)
segments(x0 = replacement_effect_diffs_men[,1], x1 = replacement_effect_diffs_men[,3], y0 = (length(vars):1)-shift, lty =2)
axis(2, at = length(vars):1, labels = var_names, las = 2)
legend("topright", legend = c("Replacement", "Within-MP"), pch = c(1, 19), bty = "n" , cex = 1)    

plot(x = socialisation_effect_diffs_women[,2], y = (length(vars):1)+shift, xlim = xlims, pch = 19, ylab = "", yaxt = "n", xlab = "Average change in female style over time", main = "Female MPs", ylim = ylims)
points(x = replacement_effect_diffs_women[,2], y = (length(vars):1)-shift, pch = 1)
abline(v = 0, lty = 3)
segments(x0 = socialisation_effect_diffs_women[,1], x1 = socialisation_effect_diffs_women[,3], y0 = (length(vars):1)+shift)
segments(x0 = replacement_effect_diffs_women[,1], x1 = replacement_effect_diffs_women[,3], y0 = (length(vars):1)-shift, lty =2)
axis(2, at = length(vars):1, labels = var_names, las = 2)
legend("topright", legend = c("Replacement", "Within-MP"), pch = c(1, 19), bty = "n" , cex = 1)    

plot(x = socialisation_effect_diffs_all[,2], y = (length(vars):1)+shift, xlim = xlims, pch = 19, ylab = "", yaxt = "n", xlab = "Female effect - male effect", main = "Female MPs - Male MPs", ylim = ylims)
points(x = replacement_effect_diffs_all[,2], y = (length(vars):1)-shift, pch = 1)
abline(v = 0, lty = 3)
segments(x0 = socialisation_effect_diffs_all[,1], x1 = socialisation_effect_diffs_all[,3], y0 = (length(vars):1)+shift)
segments(x0 = replacement_effect_diffs_all[,1], x1 = replacement_effect_diffs_all[,3], y0 = (length(vars):1)-shift, lty =2)
axis(2, at = length(vars):1, labels = var_names, las = 2)
legend("topright", legend = c("Replacement", "Within-MP"), pch = c(1, 19), bty = "n" , cex = 1)    

dev.off()


####  Model 1 and 2 - covariate analysis ####

load("working/stan_out/standata_model2.Rdata")
get_beta_effects <- function(v = "affect_std"){
  
  load(paste0("working/stan_out/model2_control/",v,"_posterior.Rdata"))
  params <- extract(posterior)
  
  out <- data.frame(style = var_names[vars==v], var = metadata$labels_X_ind, t(apply(params$Beta,2,quantile, c(0.025, 0.5, 0.975))))
  
  names(out) <- c("style", "var", "lo", "est", "hi")
  
  return(out)
}

out <- do.call("rbind", lapply(vars, get_beta_effects))
out$var <- as.character(out$var)
out$var[out$var == "age"] <- "Age"
out$var[out$var == "frontbench"] <- "Frontbench"
out$var[out$var == "committee_chair"] <- "Com. Chair"
out$var[out$var == "Other"] <- "Other party"
out$var[out$var == "LibDem"] <- "Lib Dem"
out$var[out$var == "gov_mp"] <- "Government MP"
out$var[out$var == "ld_gov_mp"] <- "LD * Government MP"
out$var[out$var == "lab_gov_mp"] <- "Lab * Government MP"
out$var[out$var == "margin"] <- "Margin of Victory"
out$var[out$var == "business"] <- "Occ. (business)"
out$var[out$var == "political"] <- "Occ. (political)"
out$var[out$var == "professional"] <- "Occ. (professional)"
out$var[out$var == "manual"] <- "Occ. (manual)"
out$var[out$var == "has_degree"] <- "University"

plot_beta <- function(style = "Affect"){
  
  tmp <- out[out$style == style,]
  tmp$y <- length(tmp$var):1
  plot(tmp$est, tmp$y,
       xlim = range(c(tmp$lo, tmp$hi)),
       pch = 19,
       xlab = expression(beta),
       ylab = "",
       yaxt = "n", 
       main = style)
  axis(2, at = tmp$y, labels = tmp$var, las = 2)
  segments(x0 = tmp$lo, x1 = tmp$hi, y0 = tmp$y)
  abline(v = 0 , lty = 3)
  abline(h = tmp$y , lty = 3, col = alpha("black",.2))
  
}

pdf("analysis/plots/model2_covariate_effects.pdf",12,5)
par(mfrow = c(2,4), mar = c(5,8.5,2,2)+0.1)
for(style in unique(out$style)) plot_beta(style)
dev.off()

####  Model 2 - R-squared ####

load("working/stan_out/standata_model2.Rdata")

get_gender_r2_time <- function(outcome = "affect_std"){
  
  load(paste0("working/stan_out/model2/",outcome,"_posterior.Rdata"))
  params <- extract(posterior)
  
  ## R-squared
  
  alpha <- params$alpha
  
  mu_0 <- params$alpha_intercept
  mu_1 <- mu_0 + params$alpha_gender_effect
  
  fitted_alpha <- matrix(NA, nrow(alpha), ncol(alpha))
  for(i in 1:nrow(fitted_alpha)) fitted_alpha[i,] <- ifelse(standata$Ri_gender == 0, mu_0[i,standata$Ri_term], mu_1[i,standata$Ri_term])
  
  eps <- alpha - fitted_alpha
  
  r_squared <- rep(NA, standata$N_terms)
  
  for(y in 1:standata$N_terms) r_squared[y] <- 1- (mean(apply(eps[,standata$Ri_term==y],1,var)) / mean(apply(alpha[,standata$Ri_term==y],1,var)))
  
  return(data.frame(term = 1:standata$N_terms, r_squared))
}

out_list <- lapply(vars, function(x) get_gender_r2_time(x))
out <- do.call("cbind", out_list)[,seq(2,dim(do.call("cbind", out_list))[2],2)]
names(out) <- vars

r_squared_time <- out

save(r_squared_time, file = "working/r_squared_time.Rdata")

####  Model 2 -  Debate effect analyses ####

speech_scores$R_debate_id <- as.numeric(as.factor(as.character(speech_scores$section_id)))

debate_titles <- speech_scores[,list(title = unique(parent), fact_std = mean(fact_std), debate_type = unique(debate_type)),by = R_debate_id]

levels(debate_titles$debate_type) <- c("Other", "QT", "PMQ", "Procedural", "Opp/Backbench", "Legislation")

setkey(debate_titles, R_debate_id)

plot_debate_effects <- function(outcome = "affect_std"){

load(paste0("working/stan_out/model2/",outcome,"_posterior.Rdata"))

params <- extract(posterior)

debate_titles$y_tmp <- apply(params$delta,2,mean)
tmp_mod <- lm(y_tmp ~ debate_type - 1, debate_titles)

coefs <- coef(summary(tmp_mod))
coefs <- data.frame(coefs)
coefs$hi <- coefs$Estimate + 1.96*coefs$Std..Error
coefs$lo <- coefs$Estimate - 1.96*coefs$Std..Error

plot(coefs[,1],1:nrow(coefs), xlim = range(c(coefs$hi, coefs$lo)), pch = 19,
     main = var_names[vars == outcome], yaxt = "n", ylab = "", xlab = "Debate-type mean")
segments(x0 = coefs$lo, x1 = coefs$hi, y0 = 1:nrow(coefs))
axis(2, at = 1:nrow(coefs), gsub("debate_type","",rownames(coefs)), las = 2)
abline(h = 1:nrow(coefs), lty = 3, col = alpha("black", .2))

}

pdf("analysis/plots/model2_debate_effects.pdf", width = 10, height = 5)
par(mfrow = c(2,4), mar = c(5,7,4,2)+0.1)
for(i in 1:length(vars)) plot_debate_effects(vars[i])
dev.off()
